{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Connect and harvest data from Twitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import twitter\n",
    "import urlparse\n",
    "import pandas as pd\n",
    "\n",
    "# pretty print\n",
    "from pprint import pprint as pp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([u'consumer_key', u' consumer_secret', u'access_token',\n",
       "       u'access_secret'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the twitter API keys.\n",
    "# skipinitialspace=True skips the blank that follows a comma in the file,\n",
    "# so a header such as ' consumer_secret' is read without its leading space.\n",
    "twitter_tokens = pd.read_csv(\"../twitter_tokens.csv\", skipinitialspace=True)\n",
    "twitter_tokens.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "class TwitterAPI(object):\n",
    "    \"\"\"\n",
    "        TwitterAPI class allows the connection to Twitter via OAuth\n",
    "        once you have registered with Twitter and received the\n",
    "        necessary credentials.\n",
    "\n",
    "        The credentials are read from the notebook-level `twitter_tokens`\n",
    "        table, which must exist before the class is instantiated.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        \"\"\"Read the credentials, authenticate and create the API client.\"\"\"\n",
    "        # Flatten the credentials table once instead of once per value.\n",
    "        # Values are taken by position: consumer key, consumer secret,\n",
    "        # access token, access secret.\n",
    "        tokens = twitter_tokens.values.flatten()\n",
    "        consumer_key = tokens[0]\n",
    "        consumer_secret = tokens[1]\n",
    "        access_token = tokens[2]\n",
    "        access_secret = tokens[3]\n",
    "\n",
    "        self.consumer_key = consumer_key\n",
    "        self.consumer_secret = consumer_secret\n",
    "        self.access_token = access_token\n",
    "        self.access_secret = access_secret\n",
    "\n",
    "        # Authenticate credentials with Twitter using OAuth\n",
    "        self.auth = twitter.oauth.OAuth(access_token, access_secret,\n",
    "                                        consumer_key, consumer_secret)\n",
    "\n",
    "        # Create registered Twitter API\n",
    "        self.api = twitter.Twitter(auth=self.auth)\n",
    "\n",
    "    def searchTwitter(self, q, max_res=10, **kwargs):\n",
    "        \"\"\"\n",
    "            Search Twitter with query q (i.e. \"ApacheSpark\").\n",
    "\n",
    "            q       -- the search query\n",
    "            max_res -- maximum number of statuses to return; values above\n",
    "                       1000 are treated as 1000, negative values as 0\n",
    "            kwargs  -- extra keyword arguments passed to the first request\n",
    "\n",
    "            Returns a list holding at most max_res statuses.\n",
    "        \"\"\"\n",
    "        max_results = max(0, min(1000, max_res))\n",
    "\n",
    "        search_results = self.api.search.tweets(q=q, count=10, **kwargs)\n",
    "        statuses = search_results['statuses']\n",
    "\n",
    "        # Follow at most 10 additional result pages\n",
    "        for _ in range(10):\n",
    "            # Enough collected: stop before issuing another request\n",
    "            if len(statuses) >= max_results:\n",
    "                break\n",
    "\n",
    "            try:\n",
    "                next_results = search_results['search_metadata']['next_results']\n",
    "            except KeyError:\n",
    "                # No 'next_results' entry: stop paging\n",
    "                break\n",
    "\n",
    "            # Drop the first character of next_results and parse the rest\n",
    "            # as a query string; its pairs become the arguments of the\n",
    "            # follow-up request\n",
    "            kwargs = dict(urlparse.parse_qsl(next_results[1:]))\n",
    "\n",
    "            search_results = self.api.search.tweets(**kwargs)\n",
    "            statuses += search_results['statuses']\n",
    "\n",
    "        # The last page may overshoot the limit, so trim the surplus\n",
    "        return statuses[:max_results]\n",
    "\n",
    "    def parseTweets(self, statuses):\n",
    "        \"\"\"\n",
    "            Extract ID, creation date, user ID, user name, expanded URL\n",
    "            and tweet text from the collected statuses.\n",
    "\n",
    "            One tuple is produced per URL entity of a status, so a status\n",
    "            with several URLs yields several tuples and a status without\n",
    "            any URL yields none.\n",
    "        \"\"\"\n",
    "        tweetx = [(status['id'],\n",
    "                   status['created_at'],\n",
    "                   status['user']['id'],\n",
    "                   status['user']['name'],\n",
    "                   url['expanded_url'],\n",
    "                   status['text'])\n",
    "                  for status in statuses\n",
    "                  for url in status['entities']['urls']]\n",
    "        return tweetx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Instantiate the class with the required authentication.\n",
    "# The constructor reads the credentials from `twitter_tokens` and stores\n",
    "# the resulting client on `obj.api`.\n",
    "obj = TwitterAPI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Run a query on the search term (max_res is left at its default of 10)\n",
    "twtx = obj.searchTwitter(\"ApacheSpark\")\n",
    "\n",
    "# Parse the tweets into tuples of\n",
    "# (id, created_at, user id, user name, expanded url, text)\n",
    "parsed_tweetx = obj.parseTweets(twtx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lenth of parsed tweets: 18 \n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>created_at</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_name</th>\n",
       "      <th>tweet_text</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>717857494886912002</td>\n",
       "      <td>Wed Apr 06 23:32:18 +0000 2016</td>\n",
       "      <td>2809658384</td>\n",
       "      <td>AmazingOpenSource</td>\n",
       "      <td>http://snip.ly/wkqtc</td>\n",
       "      <td>RT @Talend: Why #ApacheSpark is Critical to #D...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>717857388515164162</td>\n",
       "      <td>Wed Apr 06 23:31:53 +0000 2016</td>\n",
       "      <td>1014383496</td>\n",
       "      <td>Steve Tranchida</td>\n",
       "      <td>http://bit.ly/1V6Lxj4</td>\n",
       "      <td>RT @hortonworks: Learn more about #ApacheSpark...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>717857385834872832</td>\n",
       "      <td>Wed Apr 06 23:31:52 +0000 2016</td>\n",
       "      <td>114666114</td>\n",
       "      <td>Ric</td>\n",
       "      <td>http://bit.ly/1V6Lxj4</td>\n",
       "      <td>RT @hortonworks: Learn more about #ApacheSpark...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   id                      created_at     user_id  \\\n",
       "0  717857494886912002  Wed Apr 06 23:32:18 +0000 2016  2809658384   \n",
       "1  717857388515164162  Wed Apr 06 23:31:53 +0000 2016  1014383496   \n",
       "2  717857385834872832  Wed Apr 06 23:31:52 +0000 2016   114666114   \n",
       "\n",
       "           user_name             tweet_text  \\\n",
       "0  AmazingOpenSource   http://snip.ly/wkqtc   \n",
       "1    Steve Tranchida  http://bit.ly/1V6Lxj4   \n",
       "2                Ric  http://bit.ly/1V6Lxj4   \n",
       "\n",
       "                                                 url  \n",
       "0  RT @Talend: Why #ApacheSpark is Critical to #D...  \n",
       "1  RT @hortonworks: Learn more about #ApacheSpark...  \n",
       "2  RT @hortonworks: Learn more about #ApacheSpark...  "
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display output of parsed tweets\n",
    "print(\"Lenth of parsed tweets: {} \\n\\n\".format(len(parsed_tweetx)))\n",
    "\n",
    "# Serialize the data into CSV.\n",
    "# The column names must follow the tuple order produced by parseTweets:\n",
    "# (id, created_at, user id, user name, expanded url, text).\n",
    "csv_fields = ['id', 'created_at', 'user_id', 'user_name', 'url', 'tweet_text']\n",
    "tweets_2frames = pd.DataFrame(parsed_tweetx, columns=csv_fields)\n",
    "tweets_2frames.to_csv(\"tweets.csv\", encoding='utf-8')\n",
    "\n",
    "# Display first 3 rows (head() replaces the deprecated .ix indexer)\n",
    "tweets_2frames.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'RT @hortonworks: Learn more about #ApacheSpark using Scala in this online training class -  https://t.co/GR8Bp7hOvc'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# Show the complete (untruncated) value stored in the `url` column at row label 2\n",
    "tweets_2frames.url[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
